In [1]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas/')
sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from metatlas import metatlas_objects as metob
from metatlas.helpers import pactolus_tools as pt
from metatlas.helpers import dill2plots as dp
import numpy as np
import glob as glob
import os
import pandas as pd
from matplotlib import pyplot as plt
# %matplotlib notebook
%matplotlib inline
from pprint import pprint
from matplotlib import collections as mplib_collections
import matplotlib.colors as mpl_colors
Through whatever means, get a list of paths to HDF5 metatlas files. These files will be read using the metatlas data-access tools and formatted for Pactolus.
Typically you will either specify the paths by hand, select metatlas groups with dp.select_groups_for_analysis, query LcmsRun objects by experiment name, or read the paths from a spreadsheet; the cells below demonstrate each approach, and a minimal glob-based sketch follows.
Regardless, preparing the list of full paths to the HDF5 metatlas files is a necessary first step.
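For example, a minimal sketch that collects every .h5 file under a raw-data directory (the directory path here is a hypothetical placeholder, not from this analysis):
In [ ]:
import glob
import os
# Hypothetical experiment directory; substitute your own raw_data path.
raw_dir = '/project/projectdirs/metatlas/raw_data/some_user/some_experiment'
my_files = sorted(glob.glob(os.path.join(raw_dir, '*.h5')))
print len(my_files)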
In [ ]:
%system ls -ltr $SCRATCH
In [ ]:
my_files = ['/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_WT_M145_Day6_3of4___Run61.h5',
            '/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_Ref_TwoNine_Day6_1of4___Run65.h5',
            '/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_Del_TwoFour_Day6_3of4___Run21.h5']
my_groups = ['wild type','refactored','deleted']
In [ ]:
#20160413_Bhedlund_pHILIC_POS_JAD2_GBS_Set1
# EMA_%_QE144_50447_20170120
temp_group = dp.select_groups_for_analysis(name='%UV_Fungus_1to10%',
                                           most_recent=True,
                                           remove_empty=True,  # 'Strain=SB214'
                                           include_list=[], exclude_list=['QC','Blank'])
# temp_group = metob.retrieve('Groups', name = '20160413%GBS%Set1', username='*')
my_files = []
my_groups = []
for i,g in enumerate(temp_group):
    # if not '32' in g.name:
    # if not '4O' in g.name:
    print g.name  #, g.last_modified
    if len(g.items) > 0:
        for f in g.items:
            # print f.hdf5_file
            my_files.append(f.hdf5_file)
            my_groups.append(g.name)
# for f in my_files:
#     print f
In [2]:
my_run = metob.retrieve('LcmsRun',experiment='20170512_SK_-MR_SupprSoils_EthylAc2',name='%', username='*')
my_files = []
for m in my_run:
    if ('_qc-' not in m.hdf5_file.lower()) and ('injbl' not in m.hdf5_file.lower()):
        my_files.append(m.hdf5_file)
my_files = np.unique(my_files)
print len(my_files)
basenames = [os.path.basename(f) for f in my_files]
for ff in basenames:
    print ff
In [ ]:
import pandas as pd
# files = pd.read_csv('phylogenetic c18 files.csv')
files = pd.read_excel('all cameron files_RCC_selected (2).xls')
# print files.head()
all_myfiles = files['full path'].tolist()
pat = 'P_halotolerans'
my_files = []
for f in all_myfiles:
    if pat in f:
        my_files.append(f)
print len(my_files)
print my_files
The Pactolus jobs can create a large volume of temporary results, which will likely fill your home directory. I usually create my temporary storage here:
target_dir = '/project/projectdirs/openmsi/projects/ben_run_pactolus/test_tools_4'
Replace test_tools_4 with an appropriately named temporary folder.
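If the temporary folder is nested under directories that do not exist yet, os.makedirs creates the intermediate levels that os.mkdir cannot. A minimal sketch, using the root_path defined in the next cell and a placeholder folder name:
In [ ]:
import os
# 'my_run' is a placeholder; pick a descriptive name for your analysis.
target_dir = os.path.join(root_path, 'pactolus_runs', 'my_run')
if not os.path.isdir(target_dir):
    os.makedirs(target_dir)  # creates intermediate directories as needed
target_dir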
In [ ]:
root_path = %system echo $SCRATCH
root_path = root_path[0]
# root_path = root_path.replace('/cscratch1/sd','/scratch2/scratchdirs').replace('/global','')
root_path
In [3]:
root_path = '/project/projectdirs/metatlas/projects/jgi_projects/'
In [4]:
# target_dir = os.path.join(root_path,'pactolus_runs','20161011_actinorhodin')
target_dir = os.path.join(root_path,'Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2')
if not os.path.isdir(target_dir):
    os.mkdir(target_dir)
target_dir
# '/project/projectdirs/openmsi/projects/ben_run_pactolus/20161011_actinorhodin/'
Out[4]:
In [5]:
# %cat $target_dir/20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-NEG_MSMLS-P2-RD_IR2_31_31.h5_polarity_0.sbatch
In [6]:
%system ls $target_dir
Out[6]:
In [7]:
pt = reload(pt)
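# The positional 3e5 is presumably a minimum-intensity cutoff (compare intensity_min=1e5
# in the commented make_output_tables call below); check the
# create_pactolus_msms_data_container signature before relying on this.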
pt.create_pactolus_msms_data_container(my_files, target_dir, 3e5, min_rt=0.5, max_rt=30.0, make_container=True)
Out[7]:
In [ ]:
# %system cat /global/cscratch1/sd/bpb/pactolus_runs/Cori_20161031_KBL_C18_HO_SecMet_Plate1to3/*.sbatch
In [8]:
with open('/global/homes/b/bpb/Downloads/pactolus_jobs_20170512_SK_-MR_SupprSoils_EthylAc2.sh','w') as fid:
    check_path = os.path.join(target_dir,'*.sbatch')
    sbatch_files = %system ls $check_path
    for s in sbatch_files:
        fid.write('%s %s\n' % ('sbatch', s))
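Alternatively, the same jobs can be submitted straight from the notebook rather than from the generated shell script (a sketch assuming sbatch is available on the host running Jupyter):
In [ ]:
# Submit every generated batch script; %system shells out to sbatch once per file.
for s in sbatch_files:
    %system sbatch $s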
In [ ]:
len(sbatch_files)
In [ ]:
for sf in sbatch_files:
    print 'sbatch', sf
In [ ]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
df = %squeue -u bpb -p realtime
print len(my_files), df.shape
In [ ]:
from IPython.display import HTML
import pandas as pd
h = HTML(df.to_html())
with open('current_jobs.html', 'w') as my_file:
    my_file.write(h.data)
In [ ]:
pt = reload(pt)
mydirs = ['rexmalm_pos_pellet']
for d in mydirs:
    root_path = %system echo $SCRATCH
    root_path = root_path[0]
    target_dir = os.path.join(root_path, 'pactolus_runs', d)
    print d
    my_session = pt.submit_all_jobs(target_dir)
In [ ]:
pt = reload(pt)
jobs = pt.check_job_status()
In [ ]:
%system ls /project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/*.sbatch
In [ ]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/pactolus/')
from pactolus import score_frag_dag_wip as sfd
sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from metatlas import metatlas_objects as metob
import pandas as pd
In [ ]:
q = """
select rtreferences.rt_peak, rtreferences.rt_min, rtreferences.rt_max, rtreferences.rt_units, rtreferences.last_modified, rtreferences.username
from rtreferences, compoundidentifications, compoundidentifications_rt_references, compoundidentifications_compound, compounds
where
rtreferences.unique_id = compoundidentifications_rt_references.target_id and
compoundidentifications.unique_id = compoundidentifications_rt_references.source_id and
compoundidentifications.unique_id = compoundidentifications_compound.source_id and
compounds.unique_id = compoundidentifications_compound.target_id and
compounds.inchi_key = 'MTCFGRXMJLQNBG-REOHCLBHSA-N'
"""
result = [e for e in metob.database.query(q)]
In [ ]:
df = pd.DataFrame(result)
df.head()
In [ ]:
# fragmentationreferences.precursor_mz, fragmentationreferences.polarity, fragmentationreferences.collision_energy, fragmentationreferences.technique, fragmentationreferences.username
In [ ]:
q = """
select fragmentationreferences.precursor_mz, fragmentationreferences.polarity, fragmentationreferences.collision_energy, fragmentationreferences.technique, fragmentationreferences.username
from fragmentationreferences, compoundidentifications, compoundidentifications_frag_references, compoundidentifications_compound, compounds
where
fragmentationreferences.unique_id = compoundidentifications_frag_references.target_id and
compoundidentifications.unique_id = compoundidentifications_frag_references.source_id and
compoundidentifications.unique_id = compoundidentifications_compound.source_id and
compounds.unique_id = compoundidentifications_compound.target_id and
compounds.inchi_key = 'MTCFGRXMJLQNBG-REOHCLBHSA-N'
"""
result = [e for e in metob.database.query(q)]
In [ ]:
df = pd.DataFrame(result)
df.head()
In [ ]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/pactolus/')
from pactolus import score_frag_dag_wip as sfd
In [ ]:
sfd = reload(sfd)
container_file = '/project/projectdirs/metatlas/projects/pactolus_runs/20170425_EMA_For_MAGI_PAPER/container_file_polarity_1.h5'
sample_file = '20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-POS_MSMLS-PKZ-R9_IR1_148_148.h5'
# input_file = container_file + ':' + sample_file
tree_file = '/project/projectdirs/metatlas/projects/clean_pactolus_trees/tree_lookup.npy'
out_file = '/project/projectdirs/metatlas/projects/magi_paper/test_pactolus.h5'
#results is just the score matrix
results = sfd.score_main(use_command_line=False,
ms1_mass_tolerance=0.01,
ms2_mass_tolerance=0.01,
max_depth=3,
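# Assumption: these neutralization magnitudes match the proton (1.00727646677 Da),
# proton + hydrogen atom (2.01510150677 Da), and electron (0.00054857990946 Da)
# masses used for common positive-mode adducts.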
neutralizations=[-1.00727646677,-2.0151015067699998,0.00054857990946],
schedule='',
trees=tree_file,
metabolite_database=None,
input_filepath=container_file,
input_grouppath=sample_file,
output_filepath=out_file,
output_grouppath='asdf',
match_matrix=False,
precursor_mz=-1,
pass_scanmeta=True,
pass_scans=True,
loglevel='ERROR',  # use ERROR, INFO, or DEBUG
tempdir='/project/projectdirs/metatlas/projects/pactolus_tempdir',
clean_tempdir=True,
clean_output=True,
pass_compound_meta=False,
start_barrier=False)
# This creates a temp file and that temp file is then converted into the standard pactolus file
In [ ]:
%system mkdir -p /project/projectdirs/metatlas/projects/pactolus_tempdir
In [ ]:
# jobs = %system cat /project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/*.sbatch | grep 'srun -n 32 -N 1'
# for j in jobs:
# print j.replace('srun -n 32 -N 1 python','%run').replace('/tmp/packages/pactolus/','/project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/')
# print ''
In [ ]:
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/container_file_polarity_1.h5:20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --neutralizations "[-1.00727646677,-2.0151015067699998,0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --precursor_mz -1 --match_matrix False --clean_output True
In [ ]:
# sys.path.insert(1,'/global/homes/b/bpb/metaiq/pactolus/')
# import no_bastet_score_frag_dag as pactolus
In [ ]:
# scan_list, scan_metadata, experiment_metadata = load_scan_data_hdf5(filepath=input_filepath,
# grouppath=input_grouppath)
# # # Determine the ms1_mz, i.e., the precursor m/z
# # ms1_mz = scan_metadata['ms1_mz'] if 'ms1_mz' in scan_metadata else None
# # file_lookup_table = load_file_lookup_table(path=trees)
# # # size input variables
# # num_scans = len(scan_list)
# # num_compounds = len(file_lookup_table)
# # results = score_scan_list_against_trees(scan_list=scan_list,
# # ms1_mz=ms1_mz,
# # file_lookup_table=file_lookup_table,
# # neutralizations=neutralizations,
# # ms2_mass_tol=ms2_mass_tol,
# # ms1_mass_tol=ms1_mass_tol,
# # max_depth=max_depth,
# # temp_out_group=temp_out_group,
# # want_match_matrix=match_matrix,
# # )
In [ ]:
# polarity = 0
# import os
# sfd = '/global/homes/b/bpb/metaiq/pactolus/score_frag_dag.py'
# cf = os.path.join(target_dir,'container_file_polarity_%d.h5'%polarity)
# import h5py
# with h5py.File(cf,'r') as hf:
# dfs = hf.keys()
# ofs = [os.path.join(target_dir,'pactolus_results_%s'%f) for f in dfs]
# of = ofs[0]
# df = dfs[0]
# tp = '/project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy'
# pos_neut = '[-1.00727646677,-2.0151015067699998,0.00054857990946]'
# neg_neut = '[+1.00727646677,+2.0151015067699998,-0.00054857990946]'
In [ ]:
# %system cat $sfd
# %system ls -lt $target_dir
In [ ]:
# %system ps -lu bpb
In [ ]:
# print job_script
In [ ]:
# params = (sfd,cf,df,neg_neut,tp,of)
# #job_script = '%s --input "%s:%s" --neutralizations "%s" --trees %s --ms1_mass_tolerance 0.015 --ms2_mass_tolerance 0.015 --max_depth 5 --save "%s" --match_matrix False'%params
# job_script = '%s --clean_tempdir True --loglevel DEBUG --input "%s:%s" --neutralizations "%s" --trees %s --ms1_mass_tolerance 0.015 --ms2_mass_tolerance 0.015 --max_depth 5 --save "%s" --precursor_mz -1 --match_matrix False --clean_output True'%params
# %run $job_script
In [ ]:
# %tb
In [ ]:
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/container_file_polarity_1.h5:20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_6_HighExp_Media_Fungus__1001_A__Run30.h5" --neutralizations "[-1.00727646677,-2.0151015067699998,0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_6_HighExp_Media_Fungus__1001_A__Run30.h5" --precursor_mz -1 --match_matrix False --clean_output True
In [ ]:
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/container_file_polarity_1.h5:20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --neutralizations "[-1.00727646677,-2.0151015067699998,0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_7_HighExp_Media_Fungus__1001_A__Run42.h5" --precursor_mz -1 --match_matrix False --clean_output True
In [ ]:
# #%run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py --clean_tempdir True --loglevel DEBUG --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/container_file_polarity_0.h5:20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_raw-BR3_IR2_28.h5" --neutralizations "[1.00727646677,2.0151015067699998,-0.00054857990946]" --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy --ms1_mass_tolerance 0.025 --ms2_mass_tolerance 0.025 --max_depth 5 --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/pactolus_results_20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_raw-BR3_IR2_28.h5" --precursor_mz -1 --match_matrix False --clean_output True
# %run /project/projectdirs/openmsi/projects/ben_run_pactolus/metaiq/pactolus/score_frag_dag.py
# --clean_tempdir True
# --loglevel DEBUG
# --input "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/container_file_polarity_0.h5:20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_cooked-BR1_IR2_10.h5"
# --neutralizations "[1.00727646677,2.0151015067699998,-0.00054857990946]"
# --trees /project/projectdirs/openmsi/projects/ben_trees/metacyc_max_depth_5.npy
# --ms1_mass_tolerance 0.025
# --ms2_mass_tolerance 0.025
# --max_depth 5
# --save "/project/projectdirs/openmsi/projects/ben_run_pactolus/20160727_OE_TURNBAUGH_Potato2_neg/pactolus_results_20160726_SK-OE_Turnbaugh_Swtpotato2_QE119_C18-R15180_NEG_cooked-BR1_IR2_10.h5"
# --precursor_mz -1 --match_matrix False --clean_output True
In [ ]:
f = pt.check_for_failed_jobs(target_dir)
In [ ]:
#pt = reload(pt)
#metatlas_name,neutral_inchi,neutral_mass =pt.get_neutral_inchi_and_name(use_pickle=True)
In [ ]:
# target_dir = '/project/projectdirs/openmsi/projects/ben_run_pactolus/HighExp_Media_Fungus_pos/'
In [ ]:
import glob as glob
import os
import h5py
files = glob.glob(os.path.join(target_dir,'*.h5'))
for i,f in enumerate(files):
    print 'File=%d\t%s' % (i, os.path.basename(f))
    with h5py.File(f,'r') as output_file:
        my_keys = output_file.keys()
#     counter = 0
#     for k in my_keys:
#         if 'match_matrix' not in k:
#             counter = counter + 1
#     try:
#         score_matrix = output_file['score_matrix'][:]
#         num = score_matrix.shape[0]
#     except:
#         num = 0
#     print 'File=%d\tKeys=%d\tScans=%d\t%s'%(i,counter,num,os.path.basename(f))
In [ ]:
with h5py.File(os.path.join(target_dir,'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_6_HighExp_Media_Fungus__1001_A__Run30.h5'), 'r') as output_file:
    print output_file.keys()
    # [a[0] for a in output_file['tree_file_lookup_table']]
In [ ]:
pt = reload(pt)
all_dfs = pt.make_output_tables(target_dir)
#,metatlas_name,neutral_inchi,neutral_mass,score_cutoff=0.001,intensity_min=1e5,to_excel=True)
#TODO: This takes a surprisingly long time to run for each file.
In [ ]:
pt = reload(pt)
my_sheets = glob.glob(os.path.join(target_dir,'pactolus_results_*.csv'))
print my_sheets
df_all_files = pt.merge_sheets(my_sheets)
my_piv=df_all_files.pivot(columns='source_file')
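# my_piv has MultiIndex columns: level 0 is the value column (e.g. 'precursor intensity'),
# level 1 is source_file.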
In [ ]:
ref_df = pd.read_pickle('/project/projectdirs/openmsi/projects/ben_run_pactolus/unique_compounds.pkl')
#df = pd.read_csv('/project/projectdirs/openmsi/projects/compound_data/jgi_molecules/new_jgi_compounds.csv')
# df.rename(columns = {'monoisotopoic_mw':'monoisotopic_mw'},inplace=True)
print ref_df.keys()
print df_all_files.keys()
print df_all_files.shape
In [ ]:
temp= df_all_files.reset_index()
# pd.merge(restaurant_ids_dataframe, restaurant_review_frame, on='business_id', how='outer')
big_data = pd.merge(temp,ref_df,on='inchi_key')#.reset_index()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 40)
print big_data.shape
big_data.head(10)
In [ ]:
big_data.to_csv(os.path.join(target_dir,'named_pactolus_all_results_out.csv'))#, index=False)
In [ ]:
my_piv.to_csv(os.path.join(target_dir,'pactolus_pivot_out.csv'))#, index=False)
df_all_files.to_csv(os.path.join(target_dir,'pactolus_all_results_out.csv'))#, index=False)
In [ ]:
my_piv.head()
In [ ]:
import time
import os
def make_tarball(output_dir, do_timestr=True, csv_only=False):
    if do_timestr:
        timestr = time.strftime("%Y%m%d-%H%M%S") + '_'
    else:
        timestr = ''
    tarball_name = timestr + os.path.basename(os.path.normpath(output_dir)) + '.tar.gz'
    if not csv_only:
        %system tar -zcf $tarball_name -C $output_dir .
    else:
        temp = os.path.join(output_dir,'*.csv')
        %system tar -zcf $tarball_name $temp
    print 'done'
    from IPython.core.display import display, HTML
    f1 = os.path.join(os.getcwd(), tarball_name).replace('/global/u2/b/bpb','/user/bpb/notebooks')
    print f1
    f2 = tarball_name
    display(HTML('<a href="%s" download="%s">Start automatic download!</a>' % (f1, f2)))
make_tarball(target_dir, csv_only=True)
# https://jupyter-dev.nersc.gov/user/bpb/files/notebooks/Pactolus%20Tools/20161108-131105_20161011_actinorhodin.tar.gz
In [ ]:
intensity = my_piv['precursor intensity']
files = intensity.keys()
print files
# plt.plot(intensity[files[2]],intensity[files[1]],'.')
# ax = plt.gca()
# ax.set_xscale('log')
# ax.set_yscale('log')
# ax.set_xlabel(files[0])
# ax.set_ylabel(files[1])
# plt.show()
In [ ]:
my_piv = pd.read_csv(os.path.join(target_dir,'pactolus_pivot_out.csv'))
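# Note: round-tripping through CSV flattens the MultiIndex columns, which is why the
# cells below address columns like 'Unnamed: 0'.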
In [ ]:
# pt = reload(pt)
# network = pt.import_network()
def network_cyjs_to_dataframe(network_file='network_v1p0.cyjs'):
    import json
    with open(network_file) as data_file:
        data = json.load(data_file)
    nodes = pd.DataFrame(data['elements']['nodes'])
    node_data = pd.DataFrame(nodes.data.to_dict()).T
    node_position = pd.DataFrame(nodes.position.to_dict()).T
    node_data = node_data.join(node_position)
    node_data.x = node_data.x.astype(float)
    node_data.y = node_data.y.astype(float)
    edges = pd.DataFrame(data['elements']['edges'])
    edge_data = pd.DataFrame(edges.data.to_dict()).T
    edge_data.source = edge_data.source.astype(int)
    edge_data.target = edge_data.target.astype(int)
    node_data.SUID = node_data.SUID.astype(int)
    edge_data = edge_data.merge(node_data[['SUID','x','y']], how='inner', left_on='source', right_on='SUID')
    edge_data.rename(columns={'x':'source_x','y':'source_y'}, inplace=True)
    edge_data = edge_data.merge(node_data[['SUID','x','y']], how='inner', left_on='target', right_on='SUID')
    edge_data.rename(columns={'x':'target_x','y':'target_y'}, inplace=True)
    return (node_data, edge_data)
(nodes, edges) = network_cyjs_to_dataframe()
In [ ]:
nodes.keys()
In [ ]:
nodes.head(10)
In [ ]:
big_data2 = pd.merge(big_data,nodes,on='inchi_key')#.reset_index()
big_data2.head()
In [ ]:
colors = []
segs = []
norm = mpl_colors.Normalize(vmin=edges.weight.min(), vmax=edges.weight.max())
cmap = plt.cm.plasma
my_color = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
for i,row in edges.iterrows():
    colors.append(my_color.to_rgba(row.weight))
    segs.append(((row.source_x, row.source_y),
                 (row.target_x, row.target_y)))
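Because ScalarMappable.to_rgba accepts arrays, the per-row loop above can also be written in vectorized form; a sketch over the same edges dataframe:
In [ ]:
# Vectorized equivalent of the loop above: one to_rgba call, one zip for the segments.
colors = my_color.to_rgba(edges.weight.values)
segs = list(zip(zip(edges.source_x, edges.source_y), zip(edges.target_x, edges.target_y)))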
In [ ]:
fig = plt.figure(figsize=(15,15),facecolor='black')
ax = plt.gca()
myMarker = [0.299324789, 0.960615657, 0.2439362]
myEdges = [0.126700401, 0.00487433, 0.32941519]
plt.scatter(nodes['x'],nodes['y'], s=4, c='white', alpha=0.015,lw = 0)
vertices = list(zip(zip(edges.source_x,edges.source_y),zip(edges.target_x,edges.target_y)))
ln_coll = mplib_collections.LineCollection(segs,colors = colors,alpha = 1,linewidth=1)
ax.add_collection(ln_coll)
# fyi: if you don't draw the scatter points then you need to set the axes limits manually
plt.axis('off')
plt.show()
In [ ]:
#map my_piv inchi_key to str.contains() for nodes.
In [ ]:
print len(my_files)
print my_piv.shape
In [ ]:
# (file index, results csv) pairs pasted from earlier output:
# (33, 'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_5_HighExp_Media_Fungus__1001_A__Run24.csv'),
# (58, 'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_14_HighExp_Media_Control_Conditioned_1001_A__Run18.csv'),
# (64, 'pactolus_results_20160629_C18_ACN___POS_MSMS_KBL_MO_Qex_UV_1_LowExp_Media_Fungus__1001_A__Run21.csv'),
In [ ]:
my_piv.keys()
In [ ]:
small_table = my_piv[['Unnamed: 0','precursor intensity']]
small_table = small_table[small_table['precursor intensity'] > 0]
small_table.rename(columns = {'Unnamed: 0':'inchi_key'}, inplace = True)
small_table.drop(0,inplace=True)
small_table.head()
In [ ]:
# my_file = my_piv.columns.get_level_values(1)[0]
# print my_file
# small_table = my_piv['precursor intensity'][my_file].reset_index()
# small_table = small_table[np.isfinite(small_table[my_file])]
small_table = my_piv[['Unnamed: 0','precursor intensity']]
small_table = small_table[small_table['precursor intensity'] > 0]
small_table.rename(columns = {'Unnamed: 0':'inchi_key'}, inplace = True)
small_table.drop(0,inplace=True)
print small_table.shape
# small_table.head()
for i,row in small_table.iterrows():
    idx = nodes.inchi_key.str.contains(row.inchi_key)
    if sum(idx) > 0:
        small_table.loc[i,'x'] = nodes[idx]['x'].tolist()[0]
        small_table.loc[i,'y'] = nodes[idx]['y'].tolist()[0]
# pos = small_table.inchi_key.apply(lambda x: nodes[nodes.inchi_key.str.contains(x)][['x','y']])
In [ ]:
small_table.head()
In [ ]:
not_in_network = small_table[~np.isfinite(small_table['x'])]
not_in_network.to_csv('solar_pactolus_hits_not_in_network.csv')
# nodes[nodes.inchi_key.str.contains(small_table.inchi_key.tolist()[0])][['x','y']]
In [ ]:
small_table = small_table[np.isfinite(small_table['x'])]
small_table.shape
In [ ]:
from matplotlib import collections as mplib_collections
import matplotlib as mpl
import matplotlib.cm as cm
fig = plt.figure(figsize=(50,50),facecolor='cornsilk')
ax = plt.gca()
myMarker = [0.299324789, 0.960615657, 0.2439362]
myEdges = [0.126700401, 0.00487433, 0.32941519]
plt.scatter(nodes['x'],nodes['y'], s=0.1, c='white', alpha=0.015,lw = 0)
vertices = list(zip(zip(edges.source_x,edges.source_y),zip(edges.target_x,edges.target_y)))
ln_coll = mplib_collections.LineCollection(segs,colors = colors,alpha = 0.251,linewidth=2)
ax.add_collection(ln_coll)
# fyi: if you don't draw the scatter points then you need to set the axes limits manually
norm = mpl.colors.Normalize(vmin=small_table['precursor intensity'].min()**0.5, vmax=small_table['precursor intensity'].max()**0.5)
cmap = cm.viridis
my_color = cm.ScalarMappable(norm=norm, cmap=cmap)
# plt.scatter(small_table['x'],small_table['y'], s=12, c='white', alpha=1,lw = 0)
plt.scatter(small_table['x'], small_table['y'], s=20, c=my_color.to_rgba(small_table['precursor intensity']**0.5), alpha=1, lw=0)
# for i,row in small_table.iterrows():
# plt.text(row.x,row.y,row['metatlas name'])
# plt.xlim(-150000,-80000)
# plt.ylim(40000,100000)
# plt.axis('off')
temp = os.path.basename(target_dir)
ax.set_title(temp, color='b', fontsize=14)
plt.show()
fig.savefig('HighExp_Media_Control.pdf',facecolor='cornsilk')
In [ ]:
from matplotlib import collections as mplib_collections
import matplotlib as mpl
import matplotlib.cm as cm
import os
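# NOTE: this cell relies on the 'network' dict returned by pt.import_network()
# (commented out in an earlier cell), not the (nodes, edges) dataframes built above.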
my_groups = files
sub_string = os.path.commonprefix(files)
new_groups = [s.replace(sub_string,'').replace('Set','') for s in my_groups]
norm = mpl.colors.Normalize(vmin=0, vmax=1)
cmap = cm.plasma
my_color = cm.ScalarMappable(norm=norm, cmap=cmap)
# figsize is (width, height)
fig = plt.figure(figsize=(15,30), facecolor='black')
num_files = 2  # len(list(set(my_piv.columns.get_level_values(1).tolist())))
# print p.index.get_level_values(0)
# p.xs('precursor intensity',axis=1,level=0)[p.columns.get_level_values(1)[0]]
for iii in range(num_files):
    my_file = my_piv.columns.get_level_values(1)[iii]
    # print my_file
    small_table = my_piv['precursor intensity'][my_file]
    # df_all_files.reset_index(inplace=True,drop=True)
    # df_all_files.head()
    df = pd.DataFrame(small_table).fillna(0).reset_index()
    names = df[df[my_file]>0]['metatlas name'].tolist()
    print df.keys()
    vals = np.asarray(df[df[my_file]>0][my_file])
    idx = np.argwhere(vals > 0).flatten()
    temp = vals[idx]
    temp = np.log10(temp)
    temp = temp - np.min(temp)
    temp = temp / np.max(temp)
    vals[idx] = temp
    M = np.zeros(len(network['node_name']))
    idx = []
    not_found = 0
    for i,n in enumerate(network['node_name']):
        try:
            idx = names.index(n)
            M[i] = vals[idx]
        except ValueError:
            not_found += 1
    # print not_found, i, len(vals)
    plt.subplot(2,1,iii+1)
    myMarker = [0.299324789, 0.960615657, 0.2439362]
    myEdges = [0.126700401, 0.00487433, 0.32941519]
    idx_z = np.argwhere(M[:] == 0).flatten()
    idx_nz = np.argwhere(M[:] > 0.2).flatten()
    # plt.plot(network['x'][idx_z],network['y'][idx_z],'o',markersize=1, markeredgecolor='k', markerfacecolor='k', alpha=0.3)
    plt.scatter(network['x'][idx_nz], network['y'][idx_nz], s=10, c=my_color.to_rgba(M[idx_nz]), alpha=1, lw=0)
    plt.scatter(network['x'][idx_z], network['y'][idx_z], s=1, c='blue', alpha=0.1, lw=0)
    # plt.plot(network['x'][idx_nz],network['y'][idx_nz],'o',markersize=2, markeredgecolor=my_color.to_rgba(M[idx_nz]), markerfacecolor=my_color.to_rgba(M[idx_nz]))
    colors = []
    segs = []
    for e in network['data']['elements']['edges']:
        idx1 = np.argwhere(network['node_id'] == float(e['data']['source']))[0][0]
        idx2 = np.argwhere(network['node_id'] == float(e['data']['target']))[0][0]
        colors.append('grey')
        segs.append(((network['x'][idx1], network['y'][idx1]),
                     (network['x'][idx2], network['y'][idx2])))
    ln_coll = mplib_collections.LineCollection(segs, colors=colors, alpha=0.3)
    ax = plt.gca()
    ax.add_collection(ln_coll)
    temp = my_file.replace('pactolus_results_','').replace('.xls','')
    a = [g for i,g in enumerate(new_groups) if temp in my_files[i]]
    if 'blank' in my_file:
        ax.set_title('blank', color='w')
    elif a:
        ax.set_title(a[0], color='w')
    else:
        ax.set_title(temp, color='w')
    # fig.set_facecolor('w')
    # ax.set_xlim(0, 600)
    # ax.set_ylim(0, 400)
    # plt.draw()
    plt.axis('off')
plt.show()
fig.savefig(my_file.replace('pactolus_results_','').replace('.xls','') + '_networks_from_pactolus.pdf', facecolor='black')
# plt.close(fig)
In [ ]:
sub_string = os.path.commonprefix([my_piv.columns.get_level_values(1)[iii] for iii in range(39)])
for iii in range(num_files):
    my_file = my_piv.columns.get_level_values(1)[iii]
    small_table = my_piv['precursor intensity'][my_file]
    df = pd.DataFrame(small_table).fillna(0).reset_index()
    vals = np.asarray(df[df[my_file]>0][my_file])
    idx = np.argwhere(vals > 0.2).flatten()
    # temp = my_file.replace('pactolus_results_','').replace('.xls','')
    # a = [g for i,g in enumerate(new_groups) if temp in my_files[i]]
    print len(idx), '\t', '', '\t', my_file.replace(sub_string,'')
In [ ]:
for iii in range(39):
    my_file = my_piv.columns.get_level_values(1)[iii]
    # print my_file
In [ ]:
num_files = len(list(set(my_piv.columns.get_level_values(1).tolist())))